

# ################################################
# (3) Hybrid likelihood batch prediction problem 
# ################################################

get_pi <- function(m, lambda, dat){
 n = nrow(dat)
 p = 1/(n + n*lambda*m)
 return(p)
}

get_lambda <- function(m, dat){
 
 fun <- function(lambda) sum(m/(1 + lambda*m))
 
 n = nrow(dat)
 min_m = min(m)
 max_m = max(m)
 
 if (min_m*max_m < 0){
  lb = (1/n - 1)/max_m
  ub = (1/n - 1)/min_m
 }else if(min_m > 0){ 
  cat("positive \n")
  lb = (1/n - 1)/max_m
  ub = Inf
 }else{ # max_m < 0
  cat("negative \n")
  lb = -Inf
  ub = (1/n - 1)/min_m
 }
 interval = c(lb, ub)
 lambda = uniroot(fun, interval)$root
 return(lambda)
}

get_m <- function(beta, dat, Xm, Xl, Xy){
 
 n = nrow(dat)
 
 # beta_y = beta$beta_y
 # beta_l = beta$beta_l
 # beta_m = beta$beta_m
 beta_m = beta[1:(ncol(Xm))]
 beta_l = beta[(ncol(Xm)+1):(ncol(Xm)+ncol(Xl))]
 beta_y = beta[(ncol(Xm)+ncol(Xl)+1):length(beta)]
 names(beta_m) = colnames(Xm)
 names(beta_l) = colnames(Xl)
 names(beta_y) = colnames(Xy)
 
 dat_a0m0l0 = process_data(dat, a = 0, m = 0, l = 0)
 dat_a0m0l1 = process_data(dat, a = 0, m = 0, l = 1)
 dat_a0m1l0 = process_data(dat, a = 0, m = 1, l = 0)
 dat_a0m1l1 = process_data(dat, a = 0, m = 1, l = 1)
 dat_a1m0l0 = process_data(dat, a = 1, m = 0, l = 0)
 dat_a1m0l1 = process_data(dat, a = 1, m = 0, l = 1)
 dat_a1m1l0 = process_data(dat, a = 1, m = 1, l = 0)
 dat_a1m1l1 = process_data(dat, a = 1, m = 1, l = 1)
 
 dat_a0m0 = process_data(dat, a = 0, m = 0, l = dat$L)
 dat_a0m1 = process_data(dat, a = 0, m = 1, l = dat$L)
 dat_a1m0 = process_data(dat, a = 1, m = 0, l = dat$L)
 dat_a1m1 = process_data(dat, a = 1, m = 1, l = dat$L)
 
 dat_a0 = process_data(dat, a = 0, m = dat$M, l = dat$L)
 dat_a1 = process_data(dat, a = 1, m = dat$M, l = dat$L)
 
 # p(L | M, A, C)
 idx_l = match(attributes(beta_l)$names, colnames(dat))
 p_l1a0m0 = 1/(1 + exp(-dat_a0m0[, idx_l]%*%beta_l))
 p_l0a0m0 = 1 - p_l1a0m0
 p_l1a0m1 = 1/(1 + exp(-dat_a0m1[, idx_l]%*%beta_l))
 p_l0a0m1 = 1 - p_l1a0m1
 p_l1a1m0 = 1/(1 + exp(-dat_a1m0[, idx_l]%*%beta_l))
 p_l0a1m0 = 1 - p_l1a1m0
 p_l1a1m1 = 1/(1 + exp(-dat_a1m1[, idx_l]%*%beta_l))
 p_l0a1m1 = 1 - p_l1a1m1
 
 # p(M | A, C)
 idx_m = match(attributes(beta_m)$names, colnames(dat))
 p_m1a0 = 1/(1 + exp(-dat_a0[, idx_m]%*%beta_m))
 p_m0a0 = 1 - p_m1a0
 p_m1a1 = 1/(1 + exp(-dat_a1[, idx_m]%*%beta_m))
 p_m0a1 = 1 - p_m1a1

 idx_y = match(attributes(beta_y)$names, colnames(dat))
 y_a0m0l0 = dat_a0m0l0[, idx_y]%*%beta_y
 y_a0m0l1 = dat_a0m0l1[, idx_y]%*%beta_y
 y_a0m1l0 = dat_a0m1l0[, idx_y]%*%beta_y
 y_a0m1l1 = dat_a0m1l1[, idx_y]%*%beta_y
 y_a1m0l0 = dat_a1m0l0[, idx_y]%*%beta_y
 y_a1m0l1 = dat_a1m0l1[, idx_y]%*%beta_y
 y_a1m1l0 = dat_a1m1l0[, idx_y]%*%beta_y
 y_a1m1l1 = dat_a1m1l1[, idx_y]%*%beta_y
  
 m = ( (y_a1m0l0*p_m0a1 - y_a0m0l0*p_m0a0)*p_l0a0m0 + 
        (y_a1m0l1*p_m0a1 - y_a0m0l1*p_m0a0 )*p_l1a0m0 + 
        (y_a1m1l0*p_m1a1 - y_a0m1l0*p_m1a0)*p_l0a0m1 + 
        (y_a1m1l1*p_m1a1 - y_a0m1l1*p_m1a0 )*p_l1a0m1 )
 
 return(m)
}


get_negloglik <- function(beta, dat, Xm, Xl, Xy){
 n = nrow(dat)
 m = get_m(beta, dat, Xm, Xl, Xy)
 lambda = get_lambda(m, dat)
 beta_m = beta[1:(ncol(Xm))]
 beta_l = beta[(ncol(Xm)+1):(ncol(Xm)+ncol(Xl))]
 beta_y = beta[(ncol(Xm)+ncol(Xl)+1):length(beta)]

 p_X = get_pi(m, lambda, dat)
 
 Y = dat$Y
 Y_hat = Xy%*%beta_y
 p_Y = dnorm(Y, Y_hat, 1)
 
 p_L1 = 1/(1+exp(-Xl%*%beta_l))
 p_L = L*p_L1 + (1-L)*(1-p_L1)
 
 p_M1 = 1/(1+exp(-Xm%*%beta_m))
 p_M = M*p_M1 + (1-M)*(1-p_M1)
 
 loglik = sum( log(p_X) + log(p_Y) + log(p_M) + log(p_L))
 
 # loglik = sum( - log(1 + lambda*m) 
 #               - (Y - Y_hat)^2/2 
 #               - dat$M*log(1 + exp(-Xm%*%beta_m)) - (1-dat$M)*log(1 + exp(Xm%*%beta_m)))
 return(-loglik)
}

get_gradient_negloglik <- function(beta, dat, Xm, Xl, Xy, delta){
 jacob = c()
 for (i in 1:length(beta)){
  beta_tmp_l = beta
  beta_tmp_u = beta
  beta_tmp_l[i] = beta[i] - delta
  beta_tmp_u[i] = beta[i] + delta
  grad = (get_negloglik(beta_tmp_u, dat, Xm, Xl, Xy) - get_negloglik(beta_tmp_l, dat, Xm, Xl, Xy))/(2*delta)
  jacob = c(jacob, grad)
 }
 return(jacob)
}

optimize_hybrid <- function(beta, dat, fmla, opt){
 
 alpha = opt$alpha
 threshold = opt$threshold
 max_iter = opt$max_iter
 delta = opt$delta
 
 fmla_m = fmla$fmla_m 
 fmla_l = fmla$fmla_l 
 fmla_y = fmla$fmla_y

 Xm = as.matrix(model.matrix(fmla_m, data=model.frame(dat, na.action = NULL)))
 Xl = as.matrix(model.matrix(fmla_l, data=model.frame(dat, na.action = NULL)))
 Xy = as.matrix(model.matrix(fmla_y, data=model.frame(dat, na.action = NULL)))
 names(beta) = c(colnames(Xm), colnames(Xl), colnames(Xy))

 for (i in 1:max_iter){
  cat(i, "\n")
  lr = alpha/i
  grad = get_gradient_negloglik(beta, dat, Xm, Xl, Xy, delta)
  beta_next = beta - lr*grad
  
  if (all(abs(beta_next - beta) < threshold) | all(abs(grad) < threshold)) break
  
  # added
  m = get_m(beta_next, dat, Xm, Xl, Xy)
  lambda = get_lambda(m, dat)
  pi = get_pi(m, lambda, dat)
  nde = sum(m*pi)
  cat("beta diff = ", abs(beta_next - beta), "\n")
  cat("grad diff = ", abs(grad) < threshold, "\n")
  cat("nde = ", nde, "\n")
  # if(nde < 0.0001) break
  # end added
  
  beta = beta_next
  if (i == max_iter) cat("reached maximum number of iterations without convergence \n")
 }
 
 beta_m = beta[1:(ncol(Xm))]
 beta_l = beta[(ncol(Xm)+1):(ncol(Xm)+ncol(Xl))]
 beta_y = beta[(ncol(Xm)+ncol(Xl)+1):length(beta)]
 names(beta_m) = colnames(Xm)
 names(beta_l) = colnames(Xl)
 names(beta_y) = colnames(Xy)
 
 m = get_m(beta, dat, Xm, Xl, Xy)
 lambda = get_lambda(m, dat)
 pi = get_pi(m, lambda, dat)
 
 nde = sum(m*pi)
 
 Y_hat = Xy%*%beta_y
 
 return(list(beta_m=beta_m, 
             beta_l=beta_l, 
             beta_y=beta_y, 
             m=m, 
             lambda=lambda, 
             px=pi, 
             Y_hat=Y_hat, 
             nde = nde))
}
 

 
